import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
from scipy.spatial.distance import cdist
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
import sklearn.preprocessing as pp
from sklearn.preprocessing import normalize
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_tree
from xgboost import plot_importance
from numpy import loadtxt
from numpy import sort
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import shap
import random
sns.set(style="ticks")
%matplotlib inline
# Display the data dictionary describing our variables.
Dict = pd.read_csv('Data Dictionary.csv')
display(Dict)
# Read the training data, using the first CSV column (the row id) as the index.
Data = pd.read_csv('cs-training.csv', index_col=0)
# read_csv already returns a DataFrame; keep the DataDF alias used throughout.
DataDF = pd.DataFrame(Data)
print(f'Our Data has {DataDF.shape[0]} observations & {DataDF.shape[1]} Labels.')
DataDF.head()
# Feature names
DataDF.columns
# Some feature names contain '-' (awkward in formulas / attribute access) and are long;
# rename them to short, operator-free identifiers used in the rest of the analysis.
DataDF.rename(columns={'NumberOfTime30-59DaysPastDueNotWorse': 'T30DaysLate',
                       'NumberOfTime60-89DaysPastDueNotWorse': 'T60DaysLate',
                       'NumberOfTimes90DaysLate': 'T90DaysLate',
                       'RevolvingUtilizationOfUnsecuredLines': 'Revolving',
                       'NumberOfOpenCreditLinesAndLoans': 'OpenCredit',
                       'NumberRealEstateLoansOrLines': 'RealEstate'}, inplace=True)
# Column dtypes and non-null counts.
DataDF.info()
# Summary statistics to help us understand the dataset.
DataDF.describe()
The data provided are taken from real-world sources, so we expect them to contain errors. From my observations there are missing values and typo errors introduced when the data was entered, as well as some coded values that do not reflect the true meaning of the rest of the data.
# Drop exact duplicate rows.
DataDF = DataDF.drop_duplicates()
# Shape after de-duplication.
print(f'Our Data has {DataDF.shape[0]} observations & {DataDF.shape[1]} Labels.')
# Missing values per column (total rows minus non-null count).
print(DataDF.shape[0] - DataDF.count())
# Report the share of missing values computed from the data itself, instead of the
# original hard-coded counts (3828 / 29221 out of 150000), which silently go stale
# after de-duplication or any other upstream change.
n_rows = DataDF.shape[0]
for col in ('NumberOfDependents', 'MonthlyIncome'):
    pct_missing = DataDF[col].isnull().sum() / n_rows * 100
    print(f'The percentage of missing values in {col} columns is {pct_missing} %.')
# MonthlyIncome is tricky; for now impute missing values with the median
# (robust to the heavy right tail of incomes).
DataDF['MonthlyIncome'].fillna(DataDF['MonthlyIncome'].median(), inplace=True)
sum(DataDF['MonthlyIncome'].isnull())
# Missing NumberOfDependents plausibly means "not applicable" for the borrower;
# impute with the mode (which is 0).
DataDF['NumberOfDependents'].fillna(DataDF['NumberOfDependents'].mode()[0], inplace=True)
sum(DataDF['NumberOfDependents'].isnull())
# Confirm no missing values remain.
print(DataDF.shape[0] - DataDF.count())
# DataFrame.corr() produces the matrix of pairwise Pearson correlation coefficients.
# Inspecting collinearity helps reduce the number of variables needed to explain the
# target; a correlation heatmap is the quickest first look.
plt.figure(figsize=(10, 8))
sns.heatmap(DataDF.corr(), annot=True, fmt=".2f", cmap='viridis')
plt.title('Pearson Correlation Plot')
plt.xticks(rotation=55)
# Workaround for matplotlib versions that crop the top and bottom heatmap rows:
# widen the y-limits slightly in both directions.
bottom, top = plt.ylim()
plt.ylim(bottom + 0.4, top - 0.4)
plt.show()
Comment:
$T90DaysLate$, $T60DaysLate$ and $T30DaysLate$ are highly correlated.
At this stage we are exploring our variables, but we will deal with cleaning the data, detecting outliers, driving new features.
Therefore, we will return to Pearson correlation in a later stage which will be applied to logs transformation.
# Explore the distribution of every feature: one histogram per column on a 3x4 grid.
col_num = DataDF.shape[1]
fig, axis = plt.subplots(3, 4, figsize=(18, 10))
# zip stops at the shorter sequence, so any surplus axes are simply left empty —
# this replaces the original manual index counter with the same row-major order.
for ax, colname in zip(axis.flatten(), DataDF.columns[:col_num]):
    DataDF.hist(column=colname, bins=20, ax=ax)
# Quantify the asymmetry of each numeric feature with pandas' skew().
numeric_cols = DataDF.dtypes[DataDF.dtypes != 'object'].index
sorted_skew = DataDF[numeric_cols].skew().sort_values(ascending=False)
skewness = pd.DataFrame({'Skew': sorted_skew})
skewness
Comment:
A positive skewness indicates an asymmetry in the distribution towards the right-hand side of the distribution. The distribution is highly skewed for all variables except age, which has near-zero skewness, indicating a nearly normal distribution. We will apply a log transformation to all our variables except the target to lessen the skewness.
# Age
# Count borrowers below 18, the legal borrowing age.
Count_legal_age = len(DataDF.loc[DataDF["age"] < 18, "age"])
print("Total number of borrowers under the legal age {} is {}".format(18, Count_legal_age))
# The only sub-18 value observed is 0 (a data-entry error); replace it with the median age.
DataDF.loc[DataDF["age"] == 0, "age"] = DataDF.age.median()
# Number Of Dependents
# The summary stats show NumberOfDependents ranging from 0 up to 20.
Dependents = len(DataDF.loc[DataDF["NumberOfDependents"] > 10, "NumberOfDependents"])
print("Total number of dependents above 10 is {}".format(Dependents))
# Two observations with 13 and 20 dependents look like typo errors; drop both in a
# single pass instead of two separate drop() calls.
DataDF.drop(DataDF[DataDF['NumberOfDependents'].isin([13, 20])].index, inplace=True)
# Apply log(x + 1) to every feature, creating new columns so the original and
# transformed distributions can be compared later. The +1 constant keeps zeros
# valid, since log(1) == 0.
# The mapping preserves the original column-creation order (new name -> source column).
log_columns = {
    'Revolving_log': 'Revolving',
    'age_log': 'age',
    '30Days_log': 'T30DaysLate',
    'DebtRatio_log': 'DebtRatio',
    'MonthlyIncome_log': 'MonthlyIncome',
    'OpenCredit_log': 'OpenCredit',
    'RealEstate_log': 'RealEstate',
    'Dependents_log': 'NumberOfDependents',
    '60Days_log': 'T60DaysLate',
    '90Days_log': 'T90DaysLate',
}
for new_col, src_col in log_columns.items():
    DataDF[new_col] = np.log(DataDF[src_col] + 1)
# NOTE: min-max normalization (rescaling to [0, 1]) after the log transform was
# explored here but is not applied; different normalized subsets are created later
# for testing. The dead commented-out implementation was removed.
DataDF.head().T
Understanding our variables is vital for analysis. I discussed my data with people working in finance to gain domain knowledge, understand the variables, and try to derive more features later.
# Description: Total balance on credit cards and personal lines of credit (excluding
# real estate and installment debt such as car loans) divided by the sum of credit limits.
# The raw variable has many outliers; the log transform handles them, so the earlier
# idea of dropping rows with utilization > 3 is no longer needed.
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Revolving Utilization Of Unsecured Lines", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "Revolving"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "Revolving"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "Revolving_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "Revolving_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
# Description: Age of the borrower in years.
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Age", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "age"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "age"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "age_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "age_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
# QQ plot: compare the age distribution against a theoretical normal distribution.
sm.qqplot(DataDF['age'], dist=stats.norm, line='r')
plt.show()
Comment:
Looking at the above histogram for age we notice a nearly normal distribution, and the Q-Q plot confirms this. As both distributions are close, I opted for $age_log$ in our analysis so that all features follow the same transformation.
# Description: Number of times the borrower has been 30-59 days past due (but no worse)
# in the last 2 years. The theoretical maximum over two years is 24 — a borrower who
# defaulted and was late every single month.
Count_30DaysLate_Outliers = len(DataDF.loc[DataDF["T30DaysLate"] > 24])
print("Total number of borrowers above 24 times 30 days late is {}".format(Count_30DaysLate_Outliers))
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Number Of Times 30 Days Late", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "T30DaysLate"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "T30DaysLate"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "30Days_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "30Days_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
# Description: Monthly debt payments, alimony and living costs divided by monthly gross income.
# In an equation: Debt Ratio = monthly debt payments / monthly gross income.
# Outlier check: count observations above the 99.9th percentile.
# (The original comment and message claimed "4 standard deviations", but the code
# actually uses quantile(0.999); the message now matches the computation.)
Count_DebtRatio_Outliers = len(DataDF.loc[DataDF["DebtRatio"] > DataDF["DebtRatio"].quantile(0.999)])
print("Total number of borrowers with Debt Ratio above the 99.9th percentile is {}".format(Count_DebtRatio_Outliers))
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Debt Ratio", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "DebtRatio"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "DebtRatio"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "DebtRatio_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "DebtRatio_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
Comment:
Looking at the above graph, $DebtRatio$ looks reasonable now; as expected the distribution is still not normal even after applying the log transformation, but the transformation helped with the outliers and the data is less skewed.
# Description: Monthly income.
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Monthly Income", fontsize=14)  # truncated title "Monthly " completed
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "MonthlyIncome"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "MonthlyIncome"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "MonthlyIncome_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "MonthlyIncome_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
# Description: Number of open loans (installment, e.g. car loan or mortgage) and
# lines of credit (e.g. credit cards).
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Number Of Open Credit Lines And Loans", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "OpenCredit"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "OpenCredit"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
# The data plotted is only log transformed; the original "& Normalized" in the title
# was inaccurate since no normalization is applied.
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "OpenCredit_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "OpenCredit_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
# Description: Number of times the borrower has been 90 days or more past due.
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Number Of Times 90 Days Late", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "T90DaysLate"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "T90DaysLate"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "90Days_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "90Days_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
# Description: Number of mortgage and real estate loans, including home equity lines of credit.
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Number Real Estate Loans Or Lines", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "RealEstate"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "RealEstate"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "RealEstate_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "RealEstate_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
# Description: Number of times the borrower has been 60-89 days past due (but no worse)
# in the last 2 years.
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Number Of Times 60 Days Late", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "T60DaysLate"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "T60DaysLate"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "60Days_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "60Days_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
# Description: Number of dependents in the family excluding themselves (spouse, children etc.).
# Histogram + KDE comparison: original variable vs its log transform, split by default status.
fig = plt.figure(figsize=(18,4))
fig.suptitle("Number Of Dependents", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.2)
ax1 = fig.add_subplot(1,2,1)
ax1.set_title("Original Data")  # typo "Orignal" fixed
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "NumberOfDependents"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "NumberOfDependents"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
ax2 = fig.add_subplot(1,2,2)
ax2.set_title("Histogram of Log transformed")
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 0, "Dependents_log"], bins=20, color="blue", label="Non Defaults", hist_kws={'alpha':.7}, kde_kws={'linewidth':1})
sns.distplot(DataDF.loc[DataDF['SeriousDlqin2yrs'] == 1, "Dependents_log"], bins=20, color="green", label="Defaults", hist_kws={'alpha':.5}, kde_kws={'linewidth':1})
plt.legend()
# After applying the log transform, all features have a better distribution, are less
# skewed, and the outliers are tamed. Build a new dataset that keeps only the
# transformed features by dropping the raw ones.
# ('RealEstate' appeared twice in the original drop list; listing it once is enough.)
NewDataDF = DataDF.drop(['RealEstate', 'T30DaysLate', 'DebtRatio', 'MonthlyIncome',
                         'OpenCredit', 'T90DaysLate', 'T60DaysLate',
                         'NumberOfDependents', 'age', 'Revolving'], axis=1, inplace=False)
NewDataDF.head().T
# Pair plots of the log-transformed variables, colored by the target class.
log_vars = ["Revolving_log", "age_log", "30Days_log", "DebtRatio_log", "MonthlyIncome_log",
            "OpenCredit_log", "RealEstate_log", "60Days_log", "90Days_log"]
g = sns.pairplot(NewDataDF, vars=log_vars, hue="SeriousDlqin2yrs")
# Description: the response column SeriousDlqin2yrs — 1 if the borrower defaulted
# within two years, 0 otherwise.
fig = plt.figure(figsize=(8,4))
default = sns.countplot(x = DataDF.SeriousDlqin2yrs ,palette='muted')
default.set_xlabel('Defaults: 0 (No), 1 (Yes)')
default.set_ylabel('Count')
default.set_title('Serious delinquency 2 yrs')
plt.show()
# value_counts() sorts by frequency, so the majority class (no default) comes first.
[N,Y] = DataDF.SeriousDlqin2yrs.value_counts()
# Use %d: these are integer counts, not floats (%f printed e.g. "150000.000000").
print("Total Observation: %d" % (Y+N))
print("Number of people defaulted: %d" % (Y))
print('Default: ' + str((Y/(N+Y))*100) + ' %')
Comments:
# NOTE: we are cautious about removing more observations with the diagnostics below.
# Since the features are log transformed there is no need to drop anything here,
# but the analysis itself is still informative.
# Single variable outlier detection: flag values more than 4 standard deviations
# from the mean.
mean = DataDF['T60DaysLate'].mean()
std = DataDF['T60DaysLate'].std()
upper = mean + 4*std
lower = mean - 4*std
print('Outliers upper limit: '+str(upper))
print('Outliers lower limit: '+str(lower))
# Count how many observations fall outside the band.
isOutlier = ((DataDF['T60DaysLate']>upper)|(DataDF['T60DaysLate']<lower))
# A boolean Series sums directly (True counts as 1); no need for the *1 trick.
print('outlier count:', isOutlier.sum())
# Multiple variable outlier detection using the Mahalanobis distance, which accounts
# for covariance between features (unlike a per-column z-score).
# Columns to examine.
cols = ['SeriousDlqin2yrs', 'Revolving_log', 'age_log',
'30Days_log', 'DebtRatio_log', 'MonthlyIncome_log',
'OpenCredit_log', '90Days_log',
'RealEstate_log', '60Days_log',
'Dependents_log']
# Column-wise mean reshaped to a single "observation" (1 x len(cols)) for cdist.
mean_Vector = NewDataDF[cols].mean().values.reshape(1, len(cols))
# Mahalanobis distance of every row to the mean vector.
# NOTE(review): no VI (inverse covariance) argument is passed, so cdist estimates it
# from the stacked inputs — confirm this matches the intended covariance of
# NewDataDF[cols].
mahalanobis_Distances = cdist(NewDataDF[cols], mean_Vector, metric='mahalanobis').flatten()
# Scatter plot of two features, color-mapped by the Mahalanobis distances above.
plt.figure(num=8, figsize=(10, 6), linewidth=3)
sns.scatterplot(x=NewDataDF['Revolving_log'], y=NewDataDF['MonthlyIncome_log'], hue=mahalanobis_Distances, palette='plasma')
plt.suptitle('Outliers using Mahalanobis distance')
plt.xlabel('Revolving Utilization Of Unsecured Lines Log transformed')
plt.ylabel('Monthly Income Log transformed')
# Since all features are log transformed we do not remove the outliers flagged by
# these techniques; they remain useful diagnostics for the analysis.
# since we transformed all our features to log transformation we are not removing any outliers from above techniques. However they were useful for our analysis.
# Drop outliers in age above 90 years old
DataDF.drop(DataDF[DataDF['age'] > 90].index, inplace = True)
# Plot data and a linear regression model fit.
grouped_by_age = DataDF.groupby('age')
ageDlqCount = grouped_by_age['SeriousDlqin2yrs'].aggregate([np.mean,'count']).reset_index()
ageDlqCount.columns =['Age','DlqMean','Count']
print(ageDlqCount)
sns.regplot(x='Age',y='DlqMean',data=ageDlqCount, marker="+")
plt.show()
Comment:
Defaults are negatively correlated with age in general. According to US studies, unsecured loans have increased over the years for the younger generation, together with increased debt levels. Younger people may make poor borrowing decisions and are more likely to default than older adults. Older adults are more financially aware and less likely to miss payments. For these reasons, loan rates are much higher for younger people, as they are riskier profiles. (REF: BORROWING BEHAVIOUR)
Feature engineering is very useful for analysis, so we will extract new features by combining, aggregating, and transforming existing features into new ones so our models can identify more patterns. First, since the number of dependents is important — household expenses vary with it — we derive a new feature by dividing the income by the number of dependents. Second, we have an existing feature, debt ratio; multiplying it by income gives a new feature, monthly debt, which is the borrower's outstanding monthly debt. Third, a monthly balance feature seems natural, calculated by deducting the debt from the income to see the borrower's remaining balance. We will derive these new features and look into how important they could be in our modelling.
# Income per member: income divided by the number of dependents (+1 so zero
# dependents stays well-defined).
# NOTE(review): this is computed on the log-transformed columns, i.e. log income
# divided by (log dependents + 1), not a ratio in original units — confirm intended,
# since NewDataDF only retains the log-transformed features.
NewDataDF['IncomeEachMember'] = NewDataDF['MonthlyIncome_log'] / (NewDataDF['Dependents_log'] + 1)
# Monthly debt: monthly income multiplied by the debt ratio (both in log space here).
NewDataDF['MonthlyDebt'] = NewDataDF['MonthlyIncome_log']*NewDataDF['DebtRatio_log']
# Monthly balance: what remains after deducting the monthly debt from the monthly income.
NewDataDF['MonthlyBalance'] = NewDataDF['MonthlyIncome_log']-NewDataDF['MonthlyDebt']
# Inspect the (renamed) columns of the original frame.
DataDF.columns
# Heatmap of each feature's correlation with the target variable SeriousDlqin2yrs.
corr = NewDataDF.corr()
corr_delinquency = corr[['SeriousDlqin2yrs']]
# Sort by correlation, strongest positive first.
corr_delinquency = corr_delinquency.sort_values(by='SeriousDlqin2yrs', ascending=False)
plt.figure(figsize=(10, 10))
# (The redundant mid-script `import seaborn as sns` was removed; sns is already
# imported at the top of the file.)
ax = sns.heatmap(corr_delinquency, vmin=-1, cmap='viridis', vmax=1, center=0, square=True, annot=True, fmt=".2f")
bottom, top = ax.get_ylim()
ax.set_title("Correlation SeriousDlqin2yrs vs other features")
# Bug fix: the original called fig.tight_layout() on `fig`, which still pointed to a
# figure created in an earlier cell — apply the layout to the current figure instead.
plt.tight_layout()
# Workaround for matplotlib versions that crop the top/bottom heatmap rows.
ax.set_ylim(bottom + 0.5, top - 0.5)
# The above is a great way to find correlation with the target variable. Let's see if we can confirm this in the modelling section.
Our goal here is to measure the importance of our features and perform feature selection. We will use two models, a parametric and a non-parametric classifier. Parametric models carry assumptions about the data distribution, whereas boosting trees have parameters related to the algorithm itself but need no assumptions about the data distribution. So it's vital to model our data in different ways to identify which variables are important, which can help us in early detection and maybe even in improving the product or service. I will set the random state to ensure the results are comparable.
# Separate the predictors (X) from the target column (y).
X = NewDataDF.drop(['SeriousDlqin2yrs'], axis=1, inplace=False)
y = NewDataDF['SeriousDlqin2yrs']
# 80/20 train/test split; fixing random_state=123 makes the split reproducible
# across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.2, random_state=123)
We will use logistic regression as it's well suited for binary classification problems. We will use logistic regression modelling techniques to learn more about the features and coefficients, in pursuit of answering our research questions. The coefficients of the logistic regression algorithm will be obtained from our training data using maximum likelihood estimation.
import statsmodels.api as sm
# Fit a logistic regression by maximum likelihood to inspect coefficient estimates,
# standard errors and p-values.
# NOTE(review): no intercept column is added (sm.add_constant) and the model is fit
# on the full X, y rather than the training split — confirm both are intentional.
ModelLogit = sm.Logit(y,X)
# Allow up to 1000 iterations for the optimizer to converge.
result=ModelLogit.fit(maxiter=1000)
# Full summary table: coefficients, std errors, z-scores, p-values.
print(result.summary())
# Whether the MLE optimizer reported convergence within maxiter iterations.
did_converge = result.mle_retvals["converged"]
We will fit a model below using liblinear, an open-source library that supports logistic regression. Experiments show it is very efficient on large-scale linear classification data sets.
# Fit a logistic regression with the liblinear solver and evaluate it on the test set.
ModelLog = LogisticRegression(solver='liblinear', random_state=123)
ModelLog.fit(Xtrain, Ytrain)
# 10-fold cross-validated accuracy over the full dataset.
scores = cross_val_score(ModelLog, X, y, cv=10)
print('Average score across all folds is: ', scores.mean()*100, '%')  # typo "Averge" fixed
# Hard-label predictions on the held-out test set (used for the confusion matrix below).
Ypred_ModelLog = ModelLog.predict(Xtest)
# AUC on positive-class probabilities — preferred over accuracy since the classes
# are heavily imbalanced.
log_pred = ModelLog.predict_proba(Xtest)[:, 1]
roc_log = roc_auc_score(Ytest, log_pred)
print("Logistic regression test AUC: %.3f" % (roc_log))
# Tabulate and plot the learned coefficient for each feature.
# Build the frame in one step instead of the original column-by-column assembly.
FeatureC = pd.DataFrame({'features': Xtrain.columns,
                         'coefficients': ModelLog.coef_[0]})
# Round through fixed-point formatting (6 decimals), as in the original report.
FeatureC['coefficients'] = FeatureC['coefficients'].apply(lambda x: float(format(x, 'f')))
FeatureC.sort_values(by='coefficients', ascending=True).plot(x='features', y='coefficients', kind='barh')
print(FeatureC)
# Confusion matrix of the hard test-set predictions.
confusionMatrix = confusion_matrix(Ytest, Ypred_ModelLog)
print(confusionMatrix)
Comment:
From the sklearn.linear_model.LogisticRegression documentation, the feature coefficients in the decision function are log odds ratios.
The Output from stats models suggests that highest positive coefficient is the $90Days_log$ feature, followed by $30Days_log$, $60Days_log$, $Revolving_log$, $RealEstate_log$ which are all positive which means that the probability of financial distress increases with the increase of these features. This is also confirmed from the scikit-learn graph.
$age_log$ is a negative coefficient which suggests previous findings in this analysis that financial distress decreases by age.
It's worth noting that the coefficient values are not identical in both outputs as the latter uses a kind of regularization and different optimizations.
The accuracy (93.5%) using the cross validation is high and generalized well using cross fold validation.
TP = 27646, FP = 1772, FN = 190, TN = 270. However, the Recall is very low as the data is imbalanced.
XGBoost is used with a decision-tree as a base learner that uses a gradient boosting framework. The algorithm has a built-in feature importance that we would like to use to explore the importance of our features. Here will see how to measure the importance of our features. XGBoost can automatically provide estimates of feature importance from a trained model.
# Fit an XGBoost classifier on the TRAINING split only.
# Bug fix: the original fit on the full (X, y), so the "test AUC" below was computed
# on data the model had already seen (train/test leakage), overstating performance.
modelXGB = XGBClassifier()
modelXGB.fit(Xtrain, Ytrain)
# Built-in feature-importance estimates from the trained model.
print(modelXGB.feature_importances_)
# Feature importance plot (split counts per feature across all trees).
plot_importance(modelXGB)
plt.show()
# AUC on the held-out test set — preferred over accuracy since the classes
# are heavily imbalanced.
Ypred_AUC = modelXGB.predict_proba(Xtest)[:, 1]
roc_XGB = roc_auc_score(Ytest, Ypred_AUC)
print("XGBoost test AUC: %.3f" % (roc_XGB))
Comment:
The graph above shows us the feature importance. It counts the number of times each feature is split on across all trees in the model. Lastly, it visualizes the result as a bar graph, with the features ordered according to how many times they appeared. We can observe that $Revolving_log$ is the most important, whereas in the logistic regression $90Days_log$ was the most important.
Now, we know which features are important, but how many features should we select from the above? See Feature selection.
# Pearson correlation heatmap across all engineered features.
plt.figure(figsize=(10, 8))
sns.heatmap(NewDataDF.corr(), annot=True, fmt=".2f", cmap='viridis')
plt.title('Pearson Correlation Plot')
plt.xticks(rotation=55)
# Workaround for the matplotlib 3.1.1 bug that crops the first and last
# heatmap rows: widen the y-limits by 0.4 on each side.
bottom, top = plt.ylim()
plt.ylim(bottom + 0.4, top - 0.4)
plt.show()
Comment:
From the Pearson plot above we notice that $MonthlyDebt$ is very highly correlated with $MonthlyBalance$, $Debtratio_log$
with $MonthlyBalance$, and $Dependents_log$ with $IncomeEachMember$.
It's advisable to remove them but we will need to complete our analysis first.
# Separate the features (X) from the target column (y).
X = NewDataDF.drop(['SeriousDlqin2yrs'], axis=1, inplace=False)
y = NewDataDF['SeriousDlqin2yrs']
# Hold out 20% of the rows for testing; the fixed random_state pins the
# split so every rerun of the kernel produces the same partition.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.2, random_state=123)
# Hyper-parameter tuning. A RandomizedSearchCV pass located a rough region,
# then GridSearchCV refined it; the winning values (max AUC) are hard-coded
# below so a kernel refresh does not rerun the whole search.
#
# Random-search experiment kept for reference:
# randomized_model = RandomizedSearchCV(estimator=pipeline,param_distributions=gbm_param_grid,n_iter=2, scoring='roc_auc', cv=2, verbose=1)
# randomized_model.fit(X, y)

# Winning combination out of the hundreds of parameter sets that were tried.
parameters_grid = {
    'clf__loss': ['deviance'],  # 'exponential'
    'clf__learning_rate': [0.1],
    'clf__max_depth': [3],
    'clf__n_estimators': [160],
    'clf__subsample': [0.9],
    'clf__scale_pos_weight': [7],  # compensates for the class imbalance
}
# A pipeline keeps preprocessing steps (e.g. one-hot / label encoding of
# categorical features) and the classifier under one estimator interface.
pipeline = Pipeline([("clf", xgb.XGBClassifier())])
# Grid search with cross-validation, scored on ROC AUC
# ("accuracy" would be misleading on these imbalanced classes).
XGB_Grid = GridSearchCV(param_grid=parameters_grid,
                        estimator=pipeline,
                        scoring="roc_auc",
                        cv=2, verbose=1)  # return cv to 10 after testing
XGB_Grid.fit(X, y)
# Best cross-validation AUC and the parameters that produced it
# (a CV score — not comparable with the held-out test score).
print(XGB_Grid.best_score_)
print(XGB_Grid.best_params_)
pipeline.get_params().keys()
# Refit a standalone XGBClassifier with the tuned parameters.
# XGBoost handles imbalanced data via the 'scale_pos_weight' parameter,
# typically set to sum(negative instances) / sum(positive instances); it
# up-weights the positive class (1) relative to the negative class (0).
# NOTE(review): the comment below says scale would be kept at 1, but the
# code uses scale_pos_weight=7 — confirm which was intended.
# NOTE(review): 'loss' is a GradientBoostingClassifier parameter, not a
# native XGBClassifier one; XGBoost likely ignores it — TODO confirm.
# The best scale was 7 that gave the best AUC but the accuracy fell drastically. So i'll keep it 1 for analysis & comparison reasons.
XGB_Best_Model = XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, gamma=0, learning_rate=0.1,
loss='deviance', max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None,
n_estimators=160, n_jobs=1, nthread=None,
objective='binary:logistic', random_state=123,
reg_alpha=0, reg_lambda=1, scale_pos_weight=7,
seed=None, silent=None, subsample=0.9,
verbosity=1)
# NOTE(review): fitted on the full X/y, which includes the test rows —
# the test AUC below is therefore optimistic.
XGB_Best_Model.fit(X, y)
# Built-in importance scores of the tuned model: array, then bar chart.
print(XGB_Best_Model.feature_importances_)
plot_importance(XGB_Best_Model)
plt.show()
# Hold-out evaluation of the tuned model. Accuracy stays commented out
# because the classes are imbalanced; ROC AUC is the score we compare on.
#Ypred = XGB_Best_Model.predict(Xtest)
#accuracy = accuracy_score(Ytest, Ypred)
#print("Accuracy:%.3f" % (accuracy * 100.0))
# Positive-class probabilities on the test split -> ROC AUC.
best_proba = XGB_Best_Model.predict_proba(Xtest)[:, 1]
best_auc = roc_auc_score(Ytest, best_proba)
print("XGBoost test AUC: %.3f" % (best_auc))
# Feature selection driven by importance scores: each importance value is
# used as a threshold — keep only features at or above it, retrain an
# identically-parameterised booster, and record AUC/accuracy per subset.
# NOTE(review): the selector and models are fitted on the full X/y, which
# overlaps the test split, so the reported scores are optimistic.
threshold = sort(XGB_Best_Model.feature_importances_)
for i in threshold:
    # Select the features whose importance is >= the current threshold.
    ModelSelected = SelectFromModel(XGB_Best_Model, threshold=i, prefit=True)
    XtrainSelected = ModelSelected.transform(X)
    # Retrain with the same tuned hyper-parameters on the reduced set.
    selection_model = XGBClassifier(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0, learning_rate=0.1,
                                    loss='deviance', max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=160, n_jobs=1, nthread=None,
                                    objective='binary:logistic', random_state=123,
                                    reg_alpha=0, reg_lambda=1, scale_pos_weight=7,
                                    seed=None, silent=None, subsample=0.9,
                                    verbosity=1)
    selection_model.fit(XtrainSelected, y)
    # Score the reduced model on the (column-reduced) test split.
    XtestSelected = ModelSelected.transform(Xtest)
    y_pred = selection_model.predict(XtestSelected)
    y_proba = selection_model.predict_proba(XtestSelected)[:, 1]
    roc = roc_auc_score(Ytest, y_proba)
    accuracy = accuracy_score(Ytest, y_pred)
    print("Threshold=%.3f, n of features=%d, AUC:%.3f, Accuracy:%.2f%%" % (i, XtrainSelected.shape[1], roc, accuracy*100.0))
Comment:
The performance of the model started with AUC 0.875 with 13 features at threshold 0.013 and stayed the same at feature (n=10), then generally decreases with the number of selected features. So our goal is to have the best AUC 0.875 with the least number of features (n=10), therefore we selected the top 10 features from the chart above. We can say that we successfully selected our top important features. The two new features MonthlyDebt and MonthlyBalance performed well and were selected among the top 10 features.
Based on the above we opted for the top 10 features
# Final dataframe: keep only the top 10 selected features by dropping the
# three weakest columns.
FinalDataDF = NewDataDF.drop(['IncomeEachMember', 'Dependents_log', 'RealEstate_log'], axis=1, inplace=False)
FinalDataDF.columns
# Fresh 80/20 split on the reduced feature set (same seed as before, so
# the row partition is identical to the earlier split).
XFinal = FinalDataDF.drop(['SeriousDlqin2yrs'], axis=1, inplace=False)
yFinal = FinalDataDF['SeriousDlqin2yrs']
Xtrain, Xtest, Ytrain, Ytest = train_test_split(XFinal, yFinal, test_size=0.2, random_state=123)
# Train the final model on the top-10 feature set with the tuned parameters.
# scale_pos_weight=7 up-weights the rare positive class to improve recall
# on defaults (at some cost to raw accuracy).
# NOTE(review): fitted on the full XFinal/yFinal, which includes the test
# rows — downstream test-set metrics are optimistic.
XGB_Final_Model = XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, gamma=0, learning_rate=0.1,
loss='deviance', max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None,
n_estimators=160, n_jobs=1, nthread=None,
objective='binary:logistic', random_state=123,
reg_alpha=0, reg_lambda=1, scale_pos_weight=7,
seed=None, silent=None, subsample=0.9,
verbosity=1)
XGB_Final_Model.fit(XFinal, yFinal)
# confusion Matrix
# Function for confusion matrix, normalized confusion matrix & sklearn classification report
from sklearn.metrics import confusion_matrix
def ConfusionMatrix(yTrue, yPred, title = 'Confusion Matrix', cmap=plt.cm.Blues):
    """Print a classification report, then display the raw and the
    row-normalized confusion matrices for the given labels.

    Parameters
    ----------
    yTrue, yPred : array-like
        True and predicted class labels.
    title : str
        Base title used for the raw-matrix plot.
    cmap : matplotlib colormap
        Colormap for both heatmaps.
    """
    # Bug fix: the report previously used the globals Ytest / y_pred
    # instead of the yTrue / yPred arguments passed in.
    print(classification_report(yTrue, yPred));
    cm = confusion_matrix(yTrue, yPred)
    def ConfusionMatrixPlot(cm, title = 'Confusion Matrix', cmap=plt.cm.Blues):
        # Heatmap of one (raw or normalized) confusion matrix.
        ax = plt.subplot()
        # Bug fix: honour the cmap argument instead of a hard-coded "Blues".
        sns.heatmap(cm, annot=True, ax = ax, cmap=cmap);
        # labels, title and ticks
        ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels');
        # Bug fix: honour the title argument instead of a hard-coded one.
        ax.set_title(title);
        # Workaround for matplotlib 3.1.1 cropping the outer heatmap rows.
        b, t = plt.ylim()
        b += 0.4
        t -= 0.4
        plt.ylim(b, t)
    print (cm)
    ConfusionMatrixPlot(cm=cm, title=title, cmap=cmap)
    # Row-normalize so each true class sums to 1 (per-class recall view).
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('\n Normalized confusion matrix \n')
    print(cm_normalized)
    plt.figure()
    ConfusionMatrixPlot(cm_normalized, title='Normalized Confusion Matrix ', cmap=cmap)
# Final-model class predictions on the held-out test set, then print the
# classification report and confusion-matrix plots.
yPredFinal = XGB_Final_Model.predict(Xtest)
ConfusionMatrix(Ytest, yPredFinal)
Comment:
The matrix above shows that the predictions for the smaller class is better (recall).
Scale parameter performed much better than before and dealt with the imbalancement.
sns.set(rc={'figure.figsize':(6,6)})
# ROC curve of the final model on the test split, with the AUC shown
# in the legend.
y_pred_proba = XGB_Final_Model.predict_proba(Xtest)[::,1]
# Bug fix: removed a leftover line that recomputed y_proba from the
# earlier feature-selection model — it was never used in this cell and
# silently clobbered that variable.
fpr, tpr, _ = metrics.roc_curve(Ytest, y_pred_proba)
auc = metrics.roc_auc_score(Ytest, y_pred_proba)
plt.plot(fpr,tpr,label="AUC = %.2f" % (auc))
plt.legend(loc=5)
plt.show()
Comment:
The AUC is 0.87 which is very high so we can trust the results.
The ROC AUC metric does not depend on a strict discrete classification, but on the predicted class probabilities,
which is why it is more applicable to our type of problem.
# Render the first tree of the final booster and save it to disk —
# useful for walking through the split logic of an individual tree.
import xgboost as xgb
xgb.plot_tree(XGB_Final_Model, num_trees=0)
current_fig = plt.gcf()
current_fig.set_size_inches(100, 150)
current_fig.savefig('tree1.png')
Comment:
The above is a tree plot for our final XGB model explanation. It is great to explain any predictions
by following the logic. Could be messy if there were more than one tree.
Model explainability today is very important in data science. SHAP is a solid library for helping provide these explanations. We will try to explain our final XGB model by using this library and how it uses features in predicting.
# SHAP explanations for the final XGBoost model.
explainerXGB = shap.TreeExplainer(XGB_Final_Model)
# Per-row, per-feature contribution values on both splits.
shap_values_XGB_test = explainerXGB.shap_values(Xtest)
shap_values_XGB_train = explainerXGB.shap_values(Xtrain)
# DataFrame views keep the original column labels alongside the values.
df_shap_XGB_test = pd.DataFrame(shap_values_XGB_test, columns=list(Xtest.columns))
df_shap_XGB_train = pd.DataFrame(shap_values_XGB_train, columns=list(Xtrain.columns))
# Explain a single record from the test set (the first row).
j = 0
shap.initjs()  # load the JS needed to render the interactive plot
shap.force_plot(explainerXGB.expected_value, shap_values_XGB_test[j], Xtest.iloc[[j]])
Comment:
The graph above is called a force plot.
It shows features contributing to push the prediction from the base value.
The base value is the average model output over the training dataset we passed.
Features pushing the prediction higher are shown in red. Features pushing it lower appear in blue.
The record we are testing from the test set has a lower than average predicted value at -1.52 compared to -0.78. $MonthlyIncome_log$ is 8 for this record.
This pushes the predicted value lower. The details here explain why an individual prediction was made on a local level.
This is important to help us include recommendations with other decision factors.
# Global variable-importance views built from the training-split SHAP values
# (SHAP provides a theoretically sound attribution method).
# Bar chart: one aggregated importance value per feature.
shap.summary_plot(shap_values_XGB_train, Xtrain, plot_type="bar")
# Default summary plot: one point per training instance per feature.
shap.summary_plot(shap_values_XGB_train, Xtrain)
Comment:
Above is a summary plot showing the SHAP values for every instance from the training dataset, which can lead to a better understanding of overall patterns.
# Re-derive the feature matrix and target from the reduced dataframe.
X = FinalDataDF.drop(['SeriousDlqin2yrs'], axis=1, inplace=False)
y = FinalDataDF['SeriousDlqin2yrs']
sns.set(rc={'figure.figsize': (12, 12)})
sns.set_style("whitegrid")
# Standardize the features, then project onto the first two principal
# components for a 2-D visualization.
pca = PCA(n_components=2)
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
scaled_features = scaler.fit_transform(X)
components = pca.fit_transform(pd.DataFrame(scaled_features))
result = pd.DataFrame(components, columns=['PCA1', 'PCA2'])
result['class'] = y
# Scatter the two components, colored by default status.
ax = sns.lmplot(data=result, x='PCA1', y='PCA2', hue='class', height=10,
                aspect=4, fit_reg=False, palette='Set1',
                scatter_kws={'alpha': 0.2})
fig = plt.gcf()
fig.set_size_inches(12, 6)
# Share of total variance captured by each of the two components.
pca.explained_variance_ratio_
Comment:
In the PCA plot there is significant overlap between the classes; however, defaults are concentrated near the line PCA2 = 0, roughly between 0 and 5 on PCA2 and between 0 and -2 on PCA1.
In our case we tried predictions but had much lower scores with PCA so we will not use in our analysis.
The variance ratio is as following:
First component 30% of the variance
Second component 22% of the variance
Total is 52%
sns.set(rc={'figure.figsize': (12, 12)})
sns.set_style("whitegrid")
# t-SNE embedding on a 75k-row random subsample (the full dataset would
# be too slow at these settings).
tsne = TSNE(early_exaggeration=7.0, learning_rate=100.0, n_iter=250,
            metric='cosine', perplexity=20, verbose=2)
n = 75000
# train_test_split is used here purely to draw a random n-row subsample.
Xs, Xn, ys, yn = train_test_split(X, y, test_size=(1 - (n / float(len(FinalDataDF)))))
tsne_transformed = tsne.fit_transform(Xs)
tsneDF = pd.DataFrame(tsne_transformed, columns=['1', '2'])
# Re-index the sampled labels so they line up with the freshly built frame.
ys.index = range(0, len(tsneDF))
tsneDF['class'] = ys
sns.lmplot(x='1', y='2', data=tsneDF, hue='class', height=20, aspect=4,
           fit_reg=False, palette="Set1", scatter_kws={'alpha': 0.7})
Comment:
In the t-SNE plot, there is a high concentration of default predictions on some of the rings on the right side.
Even so, PCA & T-sne didn't help us answer our questions but it was helpful to understand our dataset.
Markdown word count: 1509